{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Python List Comprehension\n",
    "\n",
    "Note: The run times might differ from those presented in the article because the code was re-run on a different, much faster machine. The trend should be the same, though."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## What is LC?\n",
    "\n",
    "Analogous to set-builder form"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[x**2 for x in range(0,10)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[2, 4, 6, 8, 10, 12, 14, 16, 18]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[x for x in range(1,20) if x%2==0 ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['A', 'E', 'A', 'I']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[x for x in 'MATHEMATICS' if x in ['A','E','I','O','U']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\n",
      "4\n",
      "9\n",
      "16\n",
      "25\n",
      "36\n",
      "49\n",
      "64\n",
      "81\n",
      "100\n"
     ]
    }
   ],
   "source": [
    "for i in range(1,101):\n",
    "    if int(i**0.5)==i**0.5:\n",
    "        print i"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[i for i in range(1,101) if int(i**0.5)==i**0.5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Eg1: Matrix Flatten:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11, 12, 13, 14]]\n"
     ]
    }
   ],
   "source": [
    "# matrix = [ range(0,5), range(5,10), range(10,15) ]\n",
    "# print matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def eg1_for(matrix):\n",
    "    flat = []\n",
    "    for row in matrix:\n",
    "        for x in row:\n",
    "            flat.append(x)\n",
    "    return flat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def eg1_lc(matrix):\n",
    "    return [x for row in matrix for x in row ]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Original Matrix: [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11, 12, 13, 14]]\n",
      "FOR-loop result: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]\n",
      "LC result      : [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]\n"
     ]
    }
   ],
   "source": [
    "matrix = [ range(0,5), range(5,10), range(10,15) ]\n",
    "print \"Original Matrix: \" + str(matrix)\n",
    "print \"FOR-loop result: \" + str(eg1_for(matrix))\n",
    "print \"LC result      : \" + str(eg1_lc(matrix))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100000 loops, best of 3: 2.95 µs per loop\n"
     ]
    }
   ],
   "source": [
    "%timeit eg1_for(matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1000000 loops, best of 3: 1.35 µs per loop\n"
     ]
    }
   ],
   "source": [
    "%timeit eg1_lc(matrix)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Eg2: Removing vowels from a sentence:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'My nm s Arshy Jn!'"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def eg2_for(sentence):\n",
    "    vowels = 'aeiou'\n",
    "    filtered_list = []\n",
    "    for l in sentence:\n",
    "        if l not in vowels:\n",
    "            filtered_list.append(l)\n",
    "    return ''.join(filtered_list)\n",
    "eg2_for('My name is Aarshay Jain!')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'My nm s Arshy Jn!'"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def eg2_lc(sentence):\n",
    "    vowels = 'aeiou'\n",
    "    return ''.join([ l for l in sentence if l not in vowels])\n",
    "eg2_for('My name is Aarshay Jain!')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "FOR-loop result: My nm s Arshy Jn!\n",
      "LC result      : My nm s Arshy Jn!\n"
     ]
    }
   ],
   "source": [
    "sentence = 'My name is Aarshay Jain!'\n",
    "print \"FOR-loop result: \" + eg2_for(sentence)\n",
    "print \"LC result      : \" + eg2_lc(sentence)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100000 loops, best of 3: 4 µs per loop\n"
     ]
    }
   ],
   "source": [
    "%timeit eg2_for('My name is Aarshay Jain!')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100000 loops, best of 3: 2.49 µs per loop\n"
     ]
    }
   ],
   "source": [
    "%timeit eg2_lc('My name is Aarshay Jain!')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Eg3: Dictionary Comprehension:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "country = ['India', 'Pakistan', 'Nepal', 'Bhutan', 'China', 'Bangladesh']\n",
    "capital = ['New Delhi', 'Islamabad','Kathmandu', 'Thimphu', 'Beijing', 'Dhaka']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Bangladesh': 'Dhaka',\n",
       " 'Bhutan': 'Thimphu',\n",
       " 'China': 'Beijing',\n",
       " 'India': 'New Delhi',\n",
       " 'Nepal': 'Kathmandu',\n",
       " 'Pakistan': 'Islamabad'}"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def eg3_for(keys, values):\n",
    "    dic = {}\n",
    "    for i in range(len(keys)):\n",
    "        dic[keys[i]] = values[i]\n",
    "    return dic\n",
    "eg3_for(country,capital)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Bangladesh': 'Dhaka',\n",
       " 'Bhutan': 'Thimphu',\n",
       " 'China': 'Beijing',\n",
       " 'India': 'New Delhi',\n",
       " 'Nepal': 'Kathmandu',\n",
       " 'Pakistan': 'Islamabad'}"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def eg3_lc(keys, values):\n",
    "    return { keys[i] : values[i] for i in range(len(keys)) }\n",
    "eg3_lc(country,capital)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "FOR-loop result: {'Pakistan': 'Islamabad', 'Bangladesh': 'Dhaka', 'Bhutan': 'Thimphu', 'Nepal': 'Kathmandu', 'India': 'New Delhi', 'China': 'Beijing'}\n",
      "LC result      : {'Pakistan': 'Islamabad', 'Bangladesh': 'Dhaka', 'Bhutan': 'Thimphu', 'Nepal': 'Kathmandu', 'India': 'New Delhi', 'China': 'Beijing'}\n"
     ]
    }
   ],
   "source": [
    "country = ['India', 'Pakistan', 'Nepal', 'Bhutan', 'China', 'Bangladesh']\n",
    "capital = ['New Delhi', 'Islamabad','Kathmandu', 'Thimphu', 'Beijing', 'Dhaka']\n",
    "print \"FOR-loop result: \" + str(eg3_for(country, capital))\n",
    "print \"LC result      : \" + str(eg3_lc(country, capital))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1000000 loops, best of 3: 1.57 µs per loop\n"
     ]
    }
   ],
   "source": [
    "%timeit eg3_for(country,capital)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100000 loops, best of 3: 2.16 µs per loop\n"
     ]
    }
   ],
   "source": [
    "%timeit eg3_lc(country,capital)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Additional examples (mentioned as exercise for users)\n",
    "\n",
    "### Eg: Prime numbers:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97]\n",
      "1000 loops, best of 3: 197 µs per loop\n"
     ]
    }
   ],
   "source": [
    "#FOR:\n",
    "def eg4_for(N):\n",
    "    non_primes = [] \n",
    "    for i in range(2,int(N**0.5)+1):\n",
    "        for j in range(i,N,i):\n",
    "#             print j\n",
    "            non_primes.append(j)\n",
    "    primes = []\n",
    "    for i in range(2,N):\n",
    "        if i not in non_primes:\n",
    "            primes.append(i)\n",
    "    return primes\n",
    "print eg4_for(100)\n",
    "%timeit eg4_for(100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97]\n",
      "10000 loops, best of 3: 186 µs per loop\n"
     ]
    }
   ],
   "source": [
    "#LC:\n",
    "def eg4_lc(N):\n",
    "    non_primes = [ j for i in range(2,int(N**0.5)+1) for j in range(i,N,i)]\n",
    "    return [ i for i in range(2,N) if i not in non_primes]\n",
    "print eg4_lc(100)\n",
    "%timeit eg4_lc(100)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Eg: Matrix Multiplication:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]] [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]]\n"
     ]
    }
   ],
   "source": [
    "mat1 = [ range(0,5), range(5,10) ]\n",
    "mat2 = [ range(0,2), range(2,4), range(4,6), range(6,8), range(8,10) ]\n",
    "print mat1 , mat2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[60, 70], [160, 195]]\n",
      "100000 loops, best of 3: 10.1 µs per loop\n"
     ]
    }
   ],
   "source": [
    "def eg2_for(mat1, mat2):\n",
    "    mat1_row = len(mat1)\n",
    "    mat2_row = len(mat2) #also num of col of mat1\n",
    "    mat2_col = len(mat2[0])\n",
    "    matm2 = [ [0]*mat2_col for i in range(mat1_row) ]\n",
    "    for row in range(mat1_row):\n",
    "        for col in range(mat2_col):\n",
    "            for i in range(mat2_row):\n",
    "                matm2[row][col] += (mat1[row][i]*mat2[i][col])\n",
    "    return matm2\n",
    "print eg2_for(mat1,mat2)\n",
    "%timeit eg2_for(mat1,mat2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[60, 70, 160, 195]\n",
      "100000 loops, best of 3: 6.71 µs per loop\n"
     ]
    }
   ],
   "source": [
    "def eg2_lc(mat1, mat2):\n",
    "    mat1_row = len(mat1)\n",
    "    mat2_row = len(mat2) #also num of col of mat1\n",
    "    mat2_col = len(mat2[0])\n",
    "    matm = [ sum( [mat1[row][i]*mat2[i][col] for i in range(mat2_row)] ) for row in range(mat1_row) for col in range(mat2_col)  ]\n",
    "    return matm\n",
    "print eg2_lc(mat1,mat2)\n",
    "%timeit eg2_lc(mat1,mat2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100000 loops, best of 3: 9.29 µs per loop\n"
     ]
    }
   ],
   "source": [
    "%timeit eg2_for(mat1,mat2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100000 loops, best of 3: 9.7 µs per loop\n"
     ]
    }
   ],
   "source": [
    "%timeit eg2_lc(mat1,mat2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Eg: Find all possible triangles:\n",
    "We are given an integer N and we have to find all possible triangles with unique lengths that can be formed using side lengths <=N. Let's compare the for and LC cases: "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def tri_for(N):\n",
    "    L=[]\n",
    "    for i in range(1,N-2):\n",
    "        for j in range(i+1,N-1):\n",
    "            for k in range(j+1, N):\n",
    "                if (i+j<k) | (i+k<j) | (j+k<i):\n",
    "                    L.append((i,j,k))\n",
    "    return L            \n",
    "\n",
    "def tri_lc(N):\n",
    "    return [(i,j,k) for i in range(1,N-2) for j in range(i+1,N-1) for k in range(j+1,N) if ((i+j<k) | (i+k<j) | (j+k<i))]\n",
    "#     [ (i,j,k) for i in range(1,N-2) for j in range(i+1,N-1) for k in range(j+1,N) ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(1, 2, 4), (1, 2, 5), (1, 2, 6), (1, 2, 7), (1, 2, 8), (1, 2, 9), (1, 3, 5), (1, 3, 6), (1, 3, 7), (1, 3, 8), (1, 3, 9), (1, 4, 6), (1, 4, 7), (1, 4, 8), (1, 4, 9), (1, 5, 7), (1, 5, 8), (1, 5, 9), (1, 6, 8), (1, 6, 9), (1, 7, 9), (2, 3, 6), (2, 3, 7), (2, 3, 8), (2, 3, 9), (2, 4, 7), (2, 4, 8), (2, 4, 9), (2, 5, 8), (2, 5, 9), (2, 6, 9), (3, 4, 8), (3, 4, 9), (3, 5, 9)]\n",
      "10000 loops, best of 3: 38.3 µs per loop\n"
     ]
    }
   ],
   "source": [
    "print tri_for(10)\n",
    "%timeit tri_for(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(1, 2, 4), (1, 2, 5), (1, 2, 6), (1, 2, 7), (1, 2, 8), (1, 2, 9), (1, 3, 5), (1, 3, 6), (1, 3, 7), (1, 3, 8), (1, 3, 9), (1, 4, 6), (1, 4, 7), (1, 4, 8), (1, 4, 9), (1, 5, 7), (1, 5, 8), (1, 5, 9), (1, 6, 8), (1, 6, 9), (1, 7, 9), (2, 3, 6), (2, 3, 7), (2, 3, 8), (2, 3, 9), (2, 4, 7), (2, 4, 8), (2, 4, 9), (2, 5, 8), (2, 5, 9), (2, 6, 9), (3, 4, 8), (3, 4, 9), (3, 5, 9)]\n",
      "10000 loops, best of 3: 36.2 µs per loop\n"
     ]
    }
   ],
   "source": [
    "print tri_lc(10)\n",
    "%timeit tri_lc(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## The Time Advantage\n",
    "\n",
    "### Map Function Review:\n",
    "Used to apply a function to each element of a list or any other iterable. \n",
    "Syntax: map(function, iterable)\n",
    "For example, we can multiply each element of a list of integers with the next number."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[0, 2, 6, 12, 20, 30, 42, 56, 72, 90]"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "arr = range(10) #contains [0,1,...,9]\n",
    "map(lambda x: x*(x+1), arr)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here we have used a Python anonymous function (`lambda`). It can be replaced with a built-in Python function or a user-defined function declared earlier."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### A simple example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]\n"
     ]
    }
   ],
   "source": [
    "#Method 1: For-Loop\n",
    "def square_for(arr):\n",
    "    result = []\n",
    "    for i in arr:\n",
    "        result.append(i**2)\n",
    "    return result\n",
    "print square_for(range(1,11))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]\n"
     ]
    }
   ],
   "source": [
    "#Method 2: Map Function\n",
    "def square_map(arr):\n",
    "    return map(lambda x: x**2, arr)\n",
    "print square_map(range(1,11))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]\n"
     ]
    }
   ],
   "source": [
    "#Method 3: List comprehension:\n",
    "def square_lc(arr):\n",
    "    return [i**2 for i in arr]\n",
    "print square_lc(range(1,11))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Though the three techniques produce the same result, we can see that LC is the most elegant and readable technique. You might argue that even the map function is not bad in this case. But map has its own limitations which are not evident in this example.\n",
    "\n",
    "### Taking a step forward\n",
    "Let's include a catch here. What if we want the square of only even numbers in the list? The three functions would look like:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[4, 16, 36, 64, 100]\n"
     ]
    }
   ],
   "source": [
    "#Method 1: For-Loop\n",
    "def square_even_for(arr):\n",
    "    result = []\n",
    "    for i in arr:\n",
    "        if i%2 == 0:\n",
    "            result.append(i**2)\n",
    "    return result\n",
    "print square_even_for(range(1,11))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[4, 16, 36, 64, 100]\n"
     ]
    }
   ],
   "source": [
    "#Method 2: Map Function\n",
    "def square_even_map(arr):\n",
    "    return filter(lambda x: x is not None,map(lambda x: x**2 if x%2==0 else None, arr))\n",
    "print square_even_map(range(1,11))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[4, 16, 36, 64, 100]\n"
     ]
    }
   ],
   "source": [
    "#Method 3: List comprehension:\n",
    "def square_even_lc(arr):\n",
    "    return [i**2 for i in arr if i%2==0]\n",
    "print square_even_lc(range(1,11))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "It is clearly evident that with the slight increase in complexity, both for and map routines became bulkier and less readable. However, the LC routine is still concise and required a minor modification.\n",
    "\n",
    "Before going into more complex examples, let's try to appreciate another advantage of using LC - lower computational time!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Comparing run-times:"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "Let us compare the time taken for each of the above functions to run. We'll be using the %timeit magic function of iPython notebook to determine the runtime. Alternatively, you can use the time or timeit modules. \n",
    "\n",
    "Now you will be able to appreciate the importance of writing each code fragment as a function. Also, we shall focus on the relative run times and not the absolute values, because the latter depend on the machine specs. FYI, I am using a Dell XPS 14Z system with the following specs: \n",
    "2nd Gen i5 (2.5GHz) | 4GB RAM | 64-bit OS | Windows 7 Home Premium\n",
    "\n",
    "Let's compare the time for first example:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100000 loops, best of 3: 2.69 µs per loop\n"
     ]
    }
   ],
   "source": [
    "%timeit square_for(range(1,11))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100000 loops, best of 3: 3.16 µs per loop\n"
     ]
    }
   ],
   "source": [
    "%timeit square_map(range(1,11))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100000 loops, best of 3: 1.67 µs per loop\n"
     ]
    }
   ],
   "source": [
    "%timeit square_lc(range(1,11))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "Here we can see that in this case LC is ~30% faster than for-loop and ~45% faster than map function.\n",
    "\n",
    "Let's check for the second example:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100000 loops, best of 3: 2.31 µs per loop\n"
     ]
    }
   ],
   "source": [
    "%timeit square_even_for(range(1,11))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100000 loops, best of 3: 5.25 µs per loop\n"
     ]
    }
   ],
   "source": [
    "%timeit square_even_map(range(1,11))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1000000 loops, best of 3: 1.74 µs per loop\n"
     ]
    }
   ],
   "source": [
    "%timeit square_even_lc(range(1,11))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In this case, LC is ~20% faster than for-loop and ~65% faster than map function.\n",
    "\n",
    "Now this is something incredible. Not only is LC more elegant, it is also faster than its counterparts. Yes, even I want to get into advanced applications of LC. But hang on! I am not convinced. Why is LC faster? Will it be faster in all scenarios, or are these special cases? Let's try to find out!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Why is LC fast?\n",
    "\n",
    "I would not doubt your intellectual skills at this point if you are still wondering why LC is faster. After all, it follows the same process:\n",
    "1. Iterating over the list\n",
    "2. Modifying each element\n",
    "3. Storing the result\n",
    "\n",
    "Let's inspect each of these steps one by one. First, let's simply call a function that does nothing and compare the iteration times:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1000000 loops, best of 3: 714 ns per loop\n"
     ]
    }
   ],
   "source": [
    "#Method 1: For-loop:\n",
    "def empty_for(arr):\n",
    "    for i in arr:\n",
    "        pass\n",
    "%timeit empty_for(range(1,11))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100000 loops, best of 3: 2.29 µs per loop\n"
     ]
    }
   ],
   "source": [
    "#Method 2: Map\n",
    "def empty_map(arr):\n",
    "    map(lambda x: None,arr)\n",
    "%timeit empty_map(range(1,11))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1000000 loops, best of 3: 1.18 µs per loop\n"
     ]
    }
   ],
   "source": [
    "#Method 3: LC\n",
    "def empty_lc(arr):\n",
    "    [None for i in arr]\n",
    "%timeit empty_lc(range(1,11))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here we see that the for-loop is the fastest. This is because in a for-loop we need not return an element; we just move on to the next iteration using \"pass\".\n",
    "In both LC and map, returning an element is necessary. The code here returns None for each element. But map still takes more than twice the time. Intuitively, we can think that map involves a definite function call at each iteration, which can be the reason behind the extra time.\n",
    "\n",
    "Now, let's perform a simple operation, multiplying each number by 2, without explicitly storing the result:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1000000 loops, best of 3: 1.07 µs per loop\n"
     ]
    }
   ],
   "source": [
    "#Method 1: For-loop:\n",
    "def x2_for(arr):\n",
    "    for i in arr:\n",
    "        i*2\n",
    "%timeit x2_for(range(1,11))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100000 loops, best of 3: 2.61 µs per loop\n"
     ]
    }
   ],
   "source": [
    "#Method 2: Map\n",
    "def x2_map(arr):\n",
    "    map(lambda x: x*2,arr)\n",
    "%timeit x2_map(range(1,11))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1000000 loops, best of 3: 1.44 µs per loop\n"
     ]
    }
   ],
   "source": [
    "#Method 3: LC\n",
    "def x2_lc(arr):\n",
    "    [i*2 for i in arr]\n",
    "%timeit x2_lc(range(1,11))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here we see a similar trend as before. So up to the point of iterating and making slight modifications, the for-loop is the clear winner.\n",
    "LC is close to the for-loop, but again map takes around twice as much time. Note that here the difference in time will also depend on the complexity of the function being applied to each element.\n",
    "\n",
    "Another intuition for higher time of map and LC can be that in both cases, it is compulsory to store information and we are actually performing all 3 steps for LC and map. So let's check runtime of for-loop with step 3:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100000 loops, best of 3: 2.55 µs per loop\n"
     ]
    }
   ],
   "source": [
    "def store_for(arr):\n",
    "    result=[]\n",
    "    for i in arr:\n",
    "        result.append(i*2)\n",
    "    return result\n",
    "%timeit store_for(range(1,11))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This is interesting! So the runtime jumps to almost twice just because of storing the information. The reason is that we have to define an empty list and append the result to it in each iteration.\n",
    "\n",
    "After all 3 steps, LC seems to be the clear winner. But are you 100% sure why? Not sure about you, but I am not convinced. My intuition says that map is probably slower because it has to make a function call at each step. LC might just be calculating the value of the same expression for all elements. \n",
    "\n",
    "We can quickly check this out. Let's make a function call in LC as well:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100000 loops, best of 3: 2.67 µs per loop\n"
     ]
    }
   ],
   "source": [
    "def x2_lc(arr):\n",
    "    def mul(x):\n",
    "        return x*2\n",
    "    [mul(i) for i in arr]\n",
    "%timeit x2_lc(range(1,11))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Aha! So the guess was right. When we force LC to make function calls, it ends up being more expensive than the map function.\n",
    "\n",
    "So I guess the bottom line is that LC is faster in cases where a simple expression has to be applied to each element. But if a complex function is required, map and LC would be nearly the same. We can choose the one which works best.\n",
    "\n",
    "As promised, let's think of a slightly advanced application of LC:"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using LC as generators:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def my_first_gen(n):\n",
    "    \"\"\"Generator that yields the integers 0, 1, ..., n-1 one at a time.\"\"\"\n",
    "    for value in range(n):\n",
    "        yield value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<generator object my_first_gen at 0x0000000003DFFD38>\n"
     ]
    }
   ],
   "source": [
    "#Calling a generator function only creates a generator object; printing it shows that object, not the values\n",
    "print my_first_gen(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3\n"
     ]
    }
   ],
   "source": [
    "#Keep a reference to the generator so that values can be pulled from it one at a time\n",
    "gen = my_first_gen(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "ename": "StopIteration",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mStopIteration\u001b[0m                             Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-36-9e3acbf0bed2>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mprint\u001b[0m \u001b[0mgen\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnext\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;31mStopIteration\u001b[0m: "
     ]
    }
   ],
   "source": [
    "#Ask the generator for its next value; StopIteration is raised once it has nothing left to yield\n",
    "print next(gen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def flow_of_info_gen(N):\n",
    "    \"\"\"Yield 0..N-1 while printing a trace line before and after every yield.\n",
    "\n",
    "    The trace lines make it visible where execution pauses and where it\n",
    "    picks up again each time a further value is requested.\n",
    "    \"\"\"\n",
    "    print 'function runs for first time'\n",
    "    for step in range(N):\n",
    "        print 'execution before yielding value %d' % step\n",
    "        yield step\n",
    "        print 'execution after yielding value %d' % step\n",
    "    print 'function runs for last time'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "function runs for first time\n",
      "execution before yielding value 0\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gen2 = flow_of_info_gen(3)\n",
    "#First request: the body runs up to the first yield and hands back that value\n",
    "next(gen2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "execution after yielding value 0\n",
      "execution before yielding value 1\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Second request: execution resumes right after the previous yield\n",
    "next(gen2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "execution after yielding value 1\n",
      "execution before yielding value 2\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Third request: resumes again and yields the last value\n",
    "next(gen2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "execution after yielding value 2\n",
      "function runs for last time\n"
     ]
    },
    {
     "ename": "StopIteration",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mStopIteration\u001b[0m                             Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-44-6bdc7f25d90f>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mgen2\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnext\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;31mStopIteration\u001b[0m: "
     ]
    }
   ],
   "source": [
    "#Fourth request: the loop is over, so the body runs to its end and StopIteration is raised\n",
    "next(gen2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "39"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gen3 = my_first_gen(10)\n",
    "#Consume the first four values (0 to 3) by hand\n",
    "next(gen3)\n",
    "next(gen3)\n",
    "next(gen3)\n",
    "next(gen3)\n",
    "#sum() only sees what is left in the generator: 4+5+...+9\n",
    "sum(gen3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Square brackets: the comprehension builds and returns the complete list\n",
    "[x for x in range(10)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<generator object <genexpr> at 0x0000000003F7A630>"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Round brackets: the same expression gives a generator object instead of a list\n",
    "(x for x in range(10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "45"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#A generator expression can be passed directly as the only argument of a function call\n",
    "sum(x for x in range(10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 loops, best of 3: 1.58 s per loop\n"
     ]
    }
   ],
   "source": [
    "def sum_list(N):\n",
    "    \"\"\"Sum the integers 0..N-1 by first building them all into a list.\"\"\"\n",
    "    values = [x for x in range(N)]\n",
    "    return sum(values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 loops, best of 3: 1.42 s per loop\n"
     ]
    }
   ],
   "source": [
    "def sum_gen(N):\n",
    "    \"\"\"Sum the integers 0..N-1 by feeding a generator expression straight into sum().\"\"\"\n",
    "    return sum(x for x in range(N))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Time for LC : 10000 loops, best of 3: 56.5 µs per loop\n",
      " \n",
      "Time for Generator : 10000 loops, best of 3: 59.9 µs per loop\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#Small input: time the list version, then the generator version\n",
    "N=1000 #1 thousand\n",
    "print 'Time for LC : ',\n",
    "%timeit sum_list(N)\n",
    "print '\\nTime for Generator : ',\n",
    "%timeit sum_gen(N)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Time for LC : 100 loops, best of 3: 9.27 ms per loop\n",
      " \n",
      "Time for Generator : 100 loops, best of 3: 10.9 ms per loop\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#Same comparison on a medium-sized input\n",
    "N=100000 #100 thousand\n",
    "print 'Time for LC : ',\n",
    "%timeit sum_list(N)\n",
    "print '\\nTime for Generator : ',\n",
    "%timeit sum_gen(N)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Time for LC : 1 loops, best of 3: 1.77 s per loop\n",
      " \n",
      "Time for Generator : 1 loops, best of 3: 1.67 s per loop\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#Same comparison on a large input\n",
    "N=10000000 #10 million\n",
    "print 'Time for LC : ',\n",
    "%timeit sum_list(N)\n",
    "print '\\nTime for Generator : ',\n",
    "%timeit sum_gen(N)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Time for Generator : The slowest run took 52.48 times longer than the fastest. This could mean that an intermediate result is being cached \n",
      "1 loops, best of 3: 15.1 s per loop\n",
      " Time for LC : 1 loops, best of 3: 17min 45s per loop\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#Very large input. The generator is timed first here; the stored output shows the list version taking far longer at this size\n",
    "N=100000000 #100 million\n",
    "print '\\nTime for Generator : ',\n",
    "%timeit sum_gen(N)\n",
    "print 'Time for LC : ',\n",
    "%timeit sum_list(N)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data Science Examples:"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Example 4: Reading a list of lists:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    personID                            skills\n",
      "0          1      cricket;tabletennis;football\n",
      "1          2             tabletennis;badminton\n",
      "2          3                       tabletennis\n",
      "3          4    cricket;tabletennis;volleyball\n",
      "4          5               football;volleyball\n",
      "5          6              tabletennis;football\n",
      "6          7      cricket;volleyball;badminton\n",
      "7          8                  football;cricket\n",
      "8          9  badminton;tabletennis;volleyball\n",
      "9         10                cricket;volleyball\n",
      "10        11                football;badminton\n",
      "11        12       cricket;volleyball;football\n",
      "12        13               football;volleyball\n",
      "13        14                         badminton\n",
      "14        15      cricket;football;tabletennis\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "#Load the sample data: one row per person, with that person's skills in a single ';'-separated string\n",
    "data = pd.read_csv(\"skills.csv\")\n",
    "print data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0         [cricket, tabletennis, football]\n",
      "1                 [tabletennis, badminton]\n",
      "2                            [tabletennis]\n",
      "3       [cricket, tabletennis, volleyball]\n",
      "4                   [football, volleyball]\n",
      "5                  [tabletennis, football]\n",
      "6         [cricket, volleyball, badminton]\n",
      "7                      [football, cricket]\n",
      "8     [badminton, tabletennis, volleyball]\n",
      "9                    [cricket, volleyball]\n",
      "10                   [football, badminton]\n",
      "11         [cricket, volleyball, football]\n",
      "12                  [football, volleyball]\n",
      "13                             [badminton]\n",
      "14        [cricket, football, tabletennis]\n",
      "Name: skills_list, dtype: object\n"
     ]
    }
   ],
   "source": [
    "#Split each person's skills text on the separator ';' with the built-in vectorised string method\n",
    "#(a missing value stays missing instead of raising, unlike calling .split inside a lambda)\n",
    "data['skills_list'] = data['skills'].str.split(';')\n",
    "print data['skills_list']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "set(['badminton', 'cricket', 'football', 'tabletennis', 'volleyball'])\n"
     ]
    }
   ],
   "source": [
    "#Collect every sport mentioned by anyone. A set keeps each value only once, so duplicates drop out automatically.\n",
    "skills_unq = {sport for person_skills in data['skills_list'] for sport in person_skills}\n",
    "print skills_unq"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[0, 1, 1, 1, 0],\n",
       " [1, 0, 0, 1, 0],\n",
       " [0, 0, 0, 1, 0],\n",
       " [0, 1, 0, 1, 1],\n",
       " [0, 0, 1, 0, 1],\n",
       " [0, 0, 1, 1, 0],\n",
       " [1, 1, 0, 0, 1],\n",
       " [0, 1, 1, 0, 0],\n",
       " [1, 0, 0, 1, 1],\n",
       " [0, 1, 0, 0, 1],\n",
       " [1, 0, 1, 0, 0],\n",
       " [0, 1, 1, 0, 1],\n",
       " [0, 0, 1, 0, 1],\n",
       " [1, 0, 0, 0, 0],\n",
       " [0, 1, 1, 1, 0]]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Convert the set to a sorted list: a set has no guaranteed order, so sorting keeps the column order reproducible\n",
    "skills_unq = sorted(skills_unq)\n",
    "#One row per person, one 0/1 flag per skill telling whether that person has it\n",
    "sport_matrix = [ [1 if skill in row else 0 for skill in skills_unq] for row in data['skills_list']  ]\n",
    "sport_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    personID                            skills  \\\n",
      "0          1      cricket;tabletennis;football   \n",
      "1          2             tabletennis;badminton   \n",
      "2          3                       tabletennis   \n",
      "3          4    cricket;tabletennis;volleyball   \n",
      "4          5               football;volleyball   \n",
      "5          6              tabletennis;football   \n",
      "6          7      cricket;volleyball;badminton   \n",
      "7          8                  football;cricket   \n",
      "8          9  badminton;tabletennis;volleyball   \n",
      "9         10                cricket;volleyball   \n",
      "10        11                football;badminton   \n",
      "11        12       cricket;volleyball;football   \n",
      "12        13               football;volleyball   \n",
      "13        14                         badminton   \n",
      "14        15      cricket;football;tabletennis   \n",
      "\n",
      "                             skills_list  badminton  cricket  football  \\\n",
      "0       [cricket, tabletennis, football]          0        1         1   \n",
      "1               [tabletennis, badminton]          1        0         0   \n",
      "2                          [tabletennis]          0        0         0   \n",
      "3     [cricket, tabletennis, volleyball]          0        1         0   \n",
      "4                 [football, volleyball]          0        0         1   \n",
      "5                [tabletennis, football]          0        0         1   \n",
      "6       [cricket, volleyball, badminton]          1        1         0   \n",
      "7                    [football, cricket]          0        1         1   \n",
      "8   [badminton, tabletennis, volleyball]          1        0         0   \n",
      "9                  [cricket, volleyball]          0        1         0   \n",
      "10                 [football, badminton]          1        0         1   \n",
      "11       [cricket, volleyball, football]          0        1         1   \n",
      "12                [football, volleyball]          0        0         1   \n",
      "13                           [badminton]          1        0         0   \n",
      "14      [cricket, football, tabletennis]          0        1         1   \n",
      "\n",
      "    tabletennis  volleyball  badminton  cricket  football  tabletennis  \\\n",
      "0             1           0          0        1         1            1   \n",
      "1             1           0          1        0         0            1   \n",
      "2             1           0          0        0         0            1   \n",
      "3             1           1          0        1         0            1   \n",
      "4             0           1          0        0         1            0   \n",
      "5             1           0          0        0         1            1   \n",
      "6             0           1          1        1         0            0   \n",
      "7             0           0          0        1         1            0   \n",
      "8             1           1          1        0         0            1   \n",
      "9             0           1          0        1         0            0   \n",
      "10            0           0          1        0         1            0   \n",
      "11            0           1          0        1         1            0   \n",
      "12            0           1          0        0         1            0   \n",
      "13            0           0          1        0         0            0   \n",
      "14            1           0          0        1         1            1   \n",
      "\n",
      "    volleyball  badminton  cricket  football  tabletennis  volleyball  \n",
      "0            0          0        1         1            1           0  \n",
      "1            0          1        0         0            1           0  \n",
      "2            0          0        0         0            1           0  \n",
      "3            1          0        1         0            1           1  \n",
      "4            1          0        0         1            0           1  \n",
      "5            0          0        0         1            1           0  \n",
      "6            1          1        1         0            0           1  \n",
      "7            0          0        1         1            0           0  \n",
      "8            1          1        0         0            1           1  \n",
      "9            1          0        1         0            0           1  \n",
      "10           0          1        0         1            0           0  \n",
      "11           1          0        1         1            0           1  \n",
      "12           1          0        0         1            0           1  \n",
      "13           0          1        0         0            0           0  \n",
      "14           0          0        1         1            1           0  \n"
     ]
    }
   ],
   "source": [
    "#Drop indicator columns left over from an earlier run of this cell before appending them;\n",
    "#otherwise every re-run adds another duplicate set of skill columns to data\n",
    "data = pd.concat([data.drop(skills_unq, axis=1, errors='ignore'), pd.DataFrame(sport_matrix,columns=skills_unq)],axis=1)\n",
    "print data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Example 5: Creating powers for polynomial regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   number\n",
      "0       1\n",
      "1       2\n",
      "2       3\n",
      "3       4\n",
      "4       5\n"
     ]
    }
   ],
   "source": [
    "#A single-column frame holding the base numbers\n",
    "data2 = pd.DataFrame({'number': [1,2,3,4,5]})\n",
    "print data2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['power_2', 'power_3', 'power_4', 'power_5', 'power_6']\n"
     ]
    }
   ],
   "source": [
    "#Highest power to generate\n",
    "deg = 6\n",
    "#One column name per power from 2 up to and including deg\n",
    "cols = ['power_%d'%i for i in range(2,deg+1)]\n",
    "print cols"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[1, 1, 1, 1, 1],\n",
       " [4, 8, 16, 32, 64],\n",
       " [9, 27, 81, 243, 729],\n",
       " [16, 64, 256, 1024, 4096],\n",
       " [25, 125, 625, 3125, 15625]]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#One row per number, one entry per power from 2 up to and including deg\n",
    "power_matrix = [ [i**p for p in range(2,deg+1) ] for i in data2['number'] ]  \n",
    "power_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Start from the original 'number' column only, so re-running this cell does not append the power columns a second time\n",
    "data2 = pd.concat([data2[['number']], pd.DataFrame(power_matrix,columns=cols)],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   number  power_2  power_3  power_4  power_5  power_6\n",
      "0       1        1        1        1        1        1\n",
      "1       2        4        8       16       32       64\n",
      "2       3        9       27       81      243      729\n",
      "3       4       16       64      256     1024     4096\n",
      "4       5       25      125      625     3125    15625\n"
     ]
    }
   ],
   "source": [
    "#Show the original column followed by its powers\n",
    "print data2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Example 6: Filtering columns:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "cols = ['a','b','c','d','a_transform','b_transform','c_transform','d_power2','d_power3','d_power4','d_power5','temp1','temp2']\n",
    "#Here a,b,c,d are the original variables; *_transform are transformations, d_power* are terms for polynomial regression and temp* are intermediate variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#Select only the variables whose name ends with 'transform':\n",
    "col_set1 = [x for x in cols if x.endswith('transform')]\n",
    "#Select only the polynomial terms (name contains 'power'):\n",
    "col_set2 = [x for x in cols if 'power' in x]\n",
    "#Select variables meeting either condition; 'or' is the boolean operator, whereas '|' is the bitwise one\n",
    "col_set3 = [x for x in cols if x.endswith('transform') or 'power' in x]\n",
    "#Select everything except the intermediate 'temp' variables:\n",
    "col_set4 = [x for x in cols if x not in ['temp1','temp2']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Set1:  ['a_transform', 'b_transform', 'c_transform']\n",
      "Set2:  ['d_power2', 'd_power3', 'd_power4', 'd_power5']\n",
      "Set3:  ['a_transform', 'b_transform', 'c_transform', 'd_power2', 'd_power3', 'd_power4', 'd_power5']\n",
      "Set4:  ['a', 'b', 'c', 'd', 'a_transform', 'b_transform', 'c_transform', 'd_power2', 'd_power3', 'd_power4', 'd_power5']\n"
     ]
    }
   ],
   "source": [
    "#Show the result of each of the four selections\n",
    "print 'Set1: ', col_set1\n",
    "print 'Set2: ', col_set2\n",
    "print 'Set3: ', col_set3\n",
    "print 'Set4: ', col_set4"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
